%matplotlib inline
%config InlineBackend.figure_format='retina'
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
import folium
import seaborn as sns
from pandas_profiling import ProfileReport
import matplotlib as mpl
import scipy
import numpy as np
# Load the compressed AQUASTAT dataset and take a first look at its shape:
# one row per (country, variable, time_period) observation.
data = pd.read_csv("../data/aquastat.csv.gzip", compression="gzip")
data.head()
data.info()
# Map the short variable codes to their full descriptions.
data[["variable", "variable_full"]].drop_duplicates()
countries = data.country.unique()
data.country.nunique()
data.time_period.nunique()
time_periods = data.time_period.unique()
time_periods
def time_slice(df, time_period):
    """Return a country-indexed snapshot of *df* for a single time period.

    Rows are filtered to the requested period, then pivoted so each
    distinct 'variable' becomes a column populated from the 'value'
    column.  The column index is named after the time period.
    """
    snapshot = df.loc[df["time_period"] == time_period]
    snapshot = snapshot.pivot(index="country", columns="variable", values="value")
    snapshot.columns.name = time_period
    return snapshot
# Preview the earliest time-period slice.
time_slice(data, time_periods[0]).head()
def country_slice(df, country):
    """Pivot *df* for one country: variables as rows, time periods as columns.

    The column index is named after the country.
    """
    subset = df.loc[df.country == country]
    pivoted = subset.pivot(index="variable", columns="time_period", values="value")
    pivoted.columns.name = country
    return pivoted
# Preview a single country's slice.
country_slice(data, countries[0]).head()
def variable_slice(df, variable):
    """Pivot *df* for one variable: countries as rows, time periods as columns."""
    subset = df.loc[df.variable == variable]
    return subset.pivot(index="country", columns="time_period", values="value")
variable_slice(data, 'total_pop').head()
data.region.unique()
# Collapse the verbose AQUASTAT region hierarchy into plain continent names.
simpler_regions = {
'World | Asia':'Asia',
'Americas | Central America and Caribbean | Central America': 'North America',
'Americas | Central America and Caribbean | Greater Antilles': 'North America',
'Americas | Central America and Caribbean | Lesser Antilles and Bahamas': 'North America',
'Americas | Northern America | Northern America': 'North America',
'Americas | Northern America | Mexico': 'North America',
'Americas | Southern America | Guyana':'South America',
'Americas | Southern America | Andean':'South America',
'Americas | Southern America | Brazil':'South America',
'Americas | Southern America | Southern America':'South America',
'World | Africa':'Africa',
'World | Europe':'Europe',
'World | Oceania':'Oceania'
}
data.region = data.region.apply(lambda x: simpler_regions[x])
data.region.unique()
def get_subregion(data, region):
    """Return only the rows of *data* whose 'region' equals *region*."""
    matches = data.region == region
    return data[matches]
# Most recent period; visualize per-column completeness with missingno.
recent = time_slice(data, time_periods[-1])
recent.head()
msno.bar(recent, labels=True)
msno.matrix(recent, labels=True)
# Missingness of the 'exploitable_total' variable across all periods.
msno.matrix(variable_slice(data, 'exploitable_total'), inline=False, sort='descending')
plt.xlabel("Time Period")
plt.ylabel("Country")
plt.title("Missing total exploitable water resources data across countries and time periods \n \n \n \n")
Possible reasons for such missing data on exploitable resources are discussed below.
A column with so few data points doesn't add any value to further analysis, hence we drop it.
# Drop every 'exploitable_*' variable: too sparse to be useful.
data = data.loc[~data.variable.str.contains('exploitable'),:]
# Check missingness for the national rainfall index next.
msno.matrix(variable_slice(data, 'national_rainfall_index'), inline=False, sort='descending')
plt.xlabel("Time Period")
plt.ylabel("Country")
plt.title("Missing total national_rainfall_index water resources data across countries and time periods \n \n \n \n")
Since 2002, this data hasn't been reported at all. Hence we drop this variable too.
# Drop the national rainfall index, then inspect missingness for the
# North America subregion in the latest period.
data = data.loc[~(data.variable == 'national_rainfall_index')]
north_america = get_subregion(data, 'North America')
msno.nullity_sort(time_slice(north_america, '2013-2017'), sort='descending')
Looking at North America data
# Fraction of complete fields per North American country (transposed so
# countries are on the x axis).
north_america = get_subregion(data, 'North America')
msno.bar(msno.nullity_sort(time_slice(north_america, '2013-2017'), sort='descending').T, inline=False)
plt.title('Fraction of fields complete by country for North America \n \n');
Is there any pattern in the countries with the most missing data? What are the potential reasons for the missing data?
# Bahamas location
folium.Map(location=[18.1160128,-77.8364762], tiles="CartoDB positron", zoom_start=5, width=1200, height=600)
Check what data is missing in Bahamas
# Variables that are <10% complete for the Bahamas ("bottom" filter, p=0.1).
msno.nullity_filter(country_slice(data, 'Bahamas').T, filter="bottom", p=0.1)
# JSON with co-ordinates for country boundaries
geo_data_path = r'../data/world.json'
# 1 where agricultural-GDP data exists, 0 where missing; map it geospatially.
null_data = recent['agg_to_gdp'].notnull()*1
# NOTE(review): 'map' shadows the builtin of the same name.
map = folium.Map(location=[48, -102], zoom_start=2)
map.choropleth(
    geo_data=geo_data_path,
    data=null_data,
    columns=['country', 'agg_to_gdp'],
    key_on='feature.properties.name', reset=True,
    fill_color='GnBu', fill_opacity=1, line_opacity=0.2,
    legend_name='Missing agricultural contribution to GDP data 2013-2017'
)
map
# Heatmap of how many countries report each variable in each period.
fig, ax = plt.subplots(figsize=(16,16))
sns.heatmap(data.groupby(['time_period', 'variable']).value.count().unstack().T, ax=ax)
plt.xticks(rotation=45);
plt.xlabel('Time period');
plt.ylabel('Variable');
plt.title('Number of countries with data reported for each variable over time');
# ProfileReport(time_slice(data, '2013-2017'))
Since it's a continuous variable, we are looking at the aspects below.
recent[["total_pop", "urban_pop", "rural_pop"]].describe().astype(int)
We can see that the data is skewed, i.e., the 50% quartile is much closer to one of the 25% or 75% quartiles than to the other.
Ex: for total_pop:
50% is 6227 units away from 25% and 17493 units away from 75%.
Also, there is a huge difference of 29295 units between the mean and the 50%.
Clearly the data is skewed.
The min value in rural_pop is negative. What does this imply?
# Countries with the smallest (including negative) rural populations.
recent.sort_values("rural_pop")[["total_pop", "urban_pop", "rural_pop"]].head()
From the glossary of the dataset, we have the information that
$ rural pop = total pop - urban pop $
This validates the data we have above.
def time_series(df, country, variable):
    """
    Return the yearly time series for one (country, variable) pair.

    Rows matching the pair are kept, years without complete data are
    dropped, 'year_measured' (cast to int) becomes the index, and the
    remaining value column is renamed to *variable*.
    """
    mask = (df.country == country) & (df.variable == variable)
    ts = df[mask].dropna()[["year_measured", "value"]]
    # Integer year index makes joins across variables line up cleanly.
    ts["year_measured"] = ts["year_measured"].astype(int)
    ts = ts.set_index("year_measured")
    ts.columns = [variable]
    return ts
# Compare Qatar's total/urban/rural population series side by side.
time_series(data, 'Qatar', 'total_pop').join(time_series(data, 'Qatar', 'urban_pop')).join(time_series(data, 'Qatar', 'rural_pop'))
What to do with the negative numbers for the rural population? Replace them with rough estimates from other sources?
Skewness: Measure of lack of symmetry in the data distribution
Kurtosis: Kurtosis is a statistical measure that defines how heavily the tails of a distribution differ from the tails of a normal distribution.
recent[["total_pop", "urban_pop", "rural_pop"]].apply(scipy.stats.skew)
skewness for normal distribution should be 0
left skewed is shown by the negative values
right skewed is shown by the positive values
So, here our data is right skewed
recent[["total_pop", "urban_pop", "rural_pop"]].apply(scipy.stats.kurtosis)
fig, ax = plt.subplots(figsize=(12,8))
ax.hist(recent.total_pop.values, bins=50)
ax.set_xlabel('Total population')
ax.set_ylabel('Number of countries')
ax.set_title('Distribution of population of countries 2013-2017')
The data is definitely skewed! How do we handle it?
# Modularized helper for plotting a variable on a world choropleth map.
def plot_map(df, variable, time_period=None, log=False, legend_name=None, threshold_scale=None):
    """Render *variable* as a folium choropleth over the world map.

    Parameters
    ----------
    df : long-format DataFrame (or an already country-indexed frame when
        time_period is None)
    variable : column whose values colour the countries
    time_period : if given, df is first passed through time_slice()
    log : if True, np.log-transform the variable before colouring
    legend_name : legend caption (defaults to "<variable> for <time_period>")
    threshold_scale : optional explicit colour-bin edges

    Returns the folium.Map object.
    """
    geo_path = r"../data/world.json"  # country boundary polygons
    legend_name = legend_name if legend_name else "%s for %s" % (variable, time_period)
    if time_period:
        df = time_slice(df, time_period).reset_index()
    else:
        df = df.reset_index()
    if log:
        # Log-transform tames the heavy right skew before colour binning.
        df[variable] = df[variable].apply(np.log)
    # BUG FIX: local was named `map`, shadowing the builtin; renamed.
    world_map = folium.Map(location=[35, -45], zoom_start=2, width=800, height=400)
    world_map.choropleth(geo_data=geo_path,
                         data=df,
                         columns=['country', variable],
                         key_on='feature.properties.name', reset=True,
                         fill_color='PuBuGn', fill_opacity=0.7, line_opacity=0.2,
                         legend_name=legend_name,
                         threshold_scale=threshold_scale)
    return world_map
plot_map(data, 'total_pop', '2013-2017', legend_name='Total population')
# Skewness after log-transforming total population.
recent[['total_pop']].apply(np.log).apply(scipy.stats.skew)
The skewness reduced from 8.5 to -0.8. It definitely reduced, but now we have a little left skew.
# Kurtosis after log-transforming total population.
recent[['total_pop']].apply(np.log).apply(scipy.stats.kurtosis)
The kurtosis reduced from 76.92 to 1.08
def plot_hist(df, variable, bins=20, xlabel=None, by=None,
              ylabel=None, title=None, logx=False, ax=None):
    """Histogram of df[variable], optionally with log-spaced bins on a log x axis.

    Parameters
    ----------
    df : DataFrame holding the data
    variable : column to histogram
    bins : bin count (converted to log-spaced edges when logx=True)
    xlabel / ylabel / title : axis decorations, set only when provided
    by : unused; kept for interface compatibility
    logx : shift data to be >= 1 if needed, then use log-spaced bins
    ax : target axes (a new 12x8 figure is created when omitted)

    Returns the axes.
    """
    if not ax:
        fig, ax = plt.subplots(figsize=(12, 8))
    # BUG FIX: work on a copy — the original mutated the caller's DataFrame
    # column in place when shifting for the log transform.
    values = df[variable].copy()
    if logx:
        if values.min() <= 0:
            shift = -values.min() + 1
            values = values + shift
            # BUG FIX: the original computed the offset *after* shifting,
            # so it always printed 0; report the shift actually applied.
            print('Warning: data <=0 exists, data transformed by %0.2g before plotting' % shift)
        bins = np.logspace(np.log10(values.min()),
                           np.log10(values.max()), bins)
        ax.set_xscale("log")
    ax.hist(values.dropna().values, bins=bins)
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)
    return ax
# Log-scale histogram and log-coloured map of total population.
plot_hist(recent, 'total_pop', bins=25, logx=True,
          xlabel='Log of total population', ylabel='Number of countries',
          title='Distribution of total population of countries 2013-2017');
plot_map(data, 'total_pop', '2013-2017', legend_name='Log of total population', log=True)
We can see larger countries have larger populations - that makes sense... we may hypothesize that water availability may affect countries with higher population density rather than higher absolute populations.
# Population density = total population / total area, mapped geospatially.
recent['population_density'] = recent.total_pop.divide(recent.total_area)
plot_map(recent, 'population_density', legend_name='Population density',threshold_scale=[0,0.3,0.8, 1.8, 78])
# USA population over time
plt.plot(time_series(data, 'United States of America', 'total_pop'));
plt.xlabel('Year');
plt.ylabel('Population');
plt.title('United States population over time');
# One line per North American country, ordered by 1958-1962 population.
with sns.color_palette(sns.diverging_palette(220, 280, s=85, l=25, n=23)):
    north_america = time_slice(get_subregion(data, 'North America'), '1958-1962').sort_values('total_pop').index.tolist()
    for country in north_america:
        plt.plot(time_series(data, country, 'total_pop'), label=country);
    plt.xlabel('Year');
    plt.ylabel('Population');
    plt.title('North American populations over time');
    plt.legend(loc=2,prop={'size':5});
This graph gives no insight other than the fact that the United States dominates the region's population.
So, normalize again by each country's minimum, to see how much each country grows relative to its starting population.
# Same plot, but each series normalized to 100 at its minimum year.
with sns.color_palette(sns.diverging_palette(220, 280, s=85, l=25, n=23)):
    for country in north_america:
        ts = time_series(data, country, 'total_pop')
        ts['norm_pop'] = ts.total_pop/ts.total_pop.min()*100
        plt.plot(ts['norm_pop'], label=country);
    plt.xlabel('Year');
    plt.ylabel('Percent increase in population');
    plt.title('Percent increase in population from 1960 in North American countries');
    plt.legend(loc=2,prop={'size':5});
# Same normalization as a heatmap: rows are countries, columns are periods.
north_america_pop = variable_slice(get_subregion(data, 'North America'), 'total_pop')
north_america_norm_pop = north_america_pop.div(north_america_pop.min(axis=1), axis=0)*100
north_america_norm_pop = north_america_norm_pop.loc[north_america]
fig, ax = plt.subplots(figsize=(16, 12));
sns.heatmap(north_america_norm_pop, ax=ax, cmap=sns.light_palette((214, 90, 60), input="husl", as_cmap=True));
plt.xticks(rotation=45);
plt.xlabel('Time period');
plt.ylabel('Country, ordered by population in 1960 (<- greatest to least ->)');
plt.title('Percent increase in population from 1960');
# Distribution of total renewable water resources, raw then log-transformed.
plot_hist(recent, 'total_renewable', bins=50,
          xlabel='Total renewable water resources ($10^9 m^3/yr$)',
          ylabel='Number of countries',
          title='Distribution of total renewable water resources, 2013-2017');
plot_hist(recent, 'total_renewable', bins=50,
          xlabel='Total renewable water resources ($10^9 m^3/yr$)',
          ylabel='Number of countries', logx=True,
          title='Log transformed distribution of total renewable water resources, 2013-2017');
# Heatmap of renewable resources for North America across all periods.
north_america_renew = variable_slice(get_subregion(data, 'North America'), 'total_renewable')
fig, ax = plt.subplots(figsize=(16, 12));
sns.heatmap(north_america_renew, ax=ax, cmap=sns.light_palette((214, 90, 60), input="husl", as_cmap=True));
plt.xticks(rotation=45);
plt.xlabel('Time period');
# NOTE(review): the two labels below look copy-pasted from the population
# heatmap above; they do not describe this renewable-resources plot.
plt.ylabel('Country, ordered by population in 1960 (<- greatest to least ->)');
plt.title('Percent increase in population from 1960');
Total renewable resources doesn't seem to change over time. But, better to check
north_america_renew.head()
Subtract the 1958-1962 values from each period and add up the results.
# Zero sums mean no change relative to the first period for North America.
north_america_renew.sub(north_america_renew.iloc[:,0], axis=0).sum()
Does this apply to the rest of the world?
# Repeat the same difference-from-first-period check for the whole world.
renew = variable_slice(data, 'total_renewable')
renew.sub(renew.iloc[:,0], axis=0).sum()
NO!!!! Look at country
# Per-country totals of the change, smallest first.
renew.sub(renew.iloc[:,0], axis=0).sum(axis=1).sort_values().head()
Bhutan changed
# Largest per-country changes.
renew.sub(renew.iloc[:,0], axis=0).sum(axis=1).sort_values().tail(50)
Assess the relationships by
All our data is of continuous type. Hence we need bivariate plotting for continuous vs. continuous variables. The below are the possible plots to consider.
# Quick scatter: seasonal variability vs GDP per capita.
plt.scatter(recent.seasonal_variability, recent.gdp_per_capita)
plt.xlabel('Seasonal variability');
plt.ylabel('GDP per capita ($USD/person)');
def plot_scatter(df, x, y, xlabel=None, ylabel=None, title=None,
                 logx=False, logy=False, by=None, ax=None):
    """Scatter plot of df[x] vs df[y], optionally coloured per *by* group.

    Parameters
    ----------
    df : DataFrame holding the data
    x, y : column names to plot
    xlabel / ylabel / title : axis decorations (labels default to x / y)
    logx / logy : switch the corresponding axis to log scale
    by : optional column to group by; each group gets its own colour + legend entry
    ax : target axes (a new 12x10 figure is created when omitted)

    Returns the axes.
    """
    if not ax:
        fig, ax = plt.subplots(figsize=(12, 10))
    colors = mpl.rcParams['axes.prop_cycle'].by_key()['color']
    if by:
        groups = df.groupby(by)
        for j, (name, group) in enumerate(groups):
            # BUG FIX: wrap around the colour cycle instead of raising
            # IndexError when there are more groups than palette colours.
            ax.scatter(group[x], group[y], color=colors[j % len(colors)], label=name)
        ax.legend()
    else:
        ax.scatter(df[x], df[y], color=colors[0])
    if logx:
        ax.set_xscale('log')
    if logy:
        ax.set_yscale('log')
    ax.set_xlabel(xlabel if xlabel else x)
    ax.set_ylabel(ylabel if ylabel else y)
    if title:
        ax.set_title(title)
    return ax
# Joint distribution of seasonal variability vs GDP per capita:
# hexbin first, then a scatter with log-scaled y axis.
svr = [recent.seasonal_variability.min(), recent.seasonal_variability.max()]
gdpr = [(recent.gdp_per_capita.min()), recent.gdp_per_capita.max()]
gdpbins = np.logspace(*np.log10(gdpr), 25)
g =sns.JointGrid(x="seasonal_variability", y="gdp_per_capita", data=recent, ylim=gdpr)
g.ax_marg_x.hist(recent.seasonal_variability, range=svr)
g.ax_marg_y.hist(recent.gdp_per_capita, range=gdpr, bins=gdpbins, orientation="horizontal")
g.plot_joint(plt.hexbin, gridsize=25)
ax = g.ax_joint
# ax.set_yscale('log')
g.fig.set_figheight(8)
g.fig.set_figwidth(9)
g =sns.JointGrid(x="seasonal_variability", y="gdp_per_capita", data=recent, ylim=gdpr)
g.ax_marg_x.hist(recent.seasonal_variability, range=svr)
g.ax_marg_y.hist(recent.gdp_per_capita, range=gdpr, bins=gdpbins, orientation="horizontal")
g.plot_joint(plt.scatter)
ax = g.ax_joint
ax.set_yscale('log')
g.fig.set_figheight(8)
g.fig.set_figwidth(9)
# Correlation of every variable with GDP per capita (gdp columns dropped).
recent_corr = recent.corr().loc['gdp_per_capita'].drop(['gdp','gdp_per_capita'])
def conditional_bar(series, bar_colors=None, color_labels=None, figsize=(13,24),
                    xlabel=None, by=None, ylabel=None, title=None):
    """Horizontal bar chart of *series* with per-bar colours and a colour legend.

    bar_colors : one colour per bar (defaults to the first cycle colour)
    color_labels : {colour: legend label} mapping used to build the legend
    Returns the figure (closed so the caller controls when it is displayed).
    """
    fig, ax = plt.subplots(figsize=figsize)
    if not bar_colors:
        bar_colors = mpl.rcParams['axes.prop_cycle'].by_key()['color'][0]
    positions = range(len(series))
    plt.barh(positions, series.values, color=bar_colors)
    plt.xlabel('' if not xlabel else xlabel);
    plt.ylabel('' if not ylabel else ylabel)
    plt.yticks(positions, series.index.tolist())
    plt.title('' if not title else title);
    plt.ylim([-1, len(series)]);
    if color_labels:
        # Draw invisible square markers so each colour gets a legend entry.
        for colour, label in color_labels.items():
            plt.plot([], linestyle='', marker='s', c=colour, label=label);
        handles, labels = ax.get_legend_handles_labels();
        n_labels = len(color_labels.keys())
        ax.legend(handles[-n_labels:], labels[-n_labels:], loc='upper right');
    plt.close()  # prevent an immediate duplicate render in notebooks
    return fig
# Colour bars by correlation sign, then plot |correlation| magnitudes.
bar_colors = ['#0055A7' if x else '#2C3E4F' for x in list(recent_corr.values < 0)]
color_labels = {'#0055A7':'Negative correlation', '#2C3E4F':'Positive correlation'}
conditional_bar(recent_corr.apply(np.abs), bar_colors, color_labels,
                title='Magnitude of correlation with GDP per capita, 2013-2017',
                xlabel='|Correlation|')
For non-linear relationships, we can try
# GDP per capita distribution, raw then log-transformed.
plot_hist(recent, 'gdp_per_capita', xlabel='GDP per capita ($)',
          ylabel='Number of countries',
          title='Distribution of GDP per capita, 2013-2017');
plot_hist(recent, 'gdp_per_capita', xlabel='GDP per capita ($)', logx=True,
          ylabel='Number of countries', bins=25,
          title='Log Transformed Distribution of log GDP per capita, 2013-2017');
# Bucket countries into labelled GDP-per-capita quintiles.
capita_bins = ['Very low', 'Low', 'Medium', 'High', 'Very high']
recent['gdp_bin'] = pd.qcut(recent.gdp_per_capita, 5, capita_bins)
bin_ranges = pd.qcut(recent.gdp_per_capita, 5).unique()
def plot_hist_bins(df, variable, bins=None, xlabel=None, by=None,
                   ylabel=None, title=None, logx=False, ax=None):
    """Histogram of df[variable]; when *by* is given, overlay one histogram per category.

    Parameters
    ----------
    df : DataFrame holding the data
    variable : column to histogram
    bins : bin count (converted to log-spaced edges when logx=True)
    by : optional categorical column; one overlaid histogram per level
    xlabel / ylabel / title : axis decorations, set only when provided
    logx : use log-spaced bins on a log-scaled x axis
    ax : target axes (a new 12x8 figure is created when omitted)

    Returns the axes.
    """
    if not ax:
        fig, ax = plt.subplots(figsize=(12, 8))
    if logx:
        bins = np.logspace(np.log10(df[variable].min()),
                           np.log10(df[variable].max()), bins)
        ax.set_xscale("log")
    if by:
        # BUG FIX: original used `type(...) == pd.core.arrays.categorical.Categorical`;
        # isinstance with the public pd.Categorical alias is the correct check.
        uniques = df[by].unique()
        if isinstance(uniques, pd.Categorical):
            cats = uniques.categories.tolist()
        else:
            cats = uniques.tolist()
        for cat in cats:
            to_plot = df[df[by] == cat][variable].dropna()
            ax.hist(to_plot, bins=bins)
    else:
        ax.hist(df[variable].dropna().values, bins=bins)
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)
    return ax
# Overlay one log-binned histogram per GDP quintile.
plot_hist_bins(recent, 'gdp_per_capita', xlabel='GDP per capita ($)', logx=True,
               ylabel='Number of countries', bins=25, by='gdp_bin',
               title='Distribution of log GDP per capita, 2013-2017')
Now that we have a CATEGORICAL X CONTINUOUS analysis, we can look at the distribution of a few variables for each gdp group.
# Boxplot of drinking-water access by GDP quintile.
recent[['gdp_bin','total_pop_access_drinking']].boxplot(by='gdp_bin');
# plt.ylim([0,100000]);
plt.title('Distribution of percent of total population with access to drinking water across gdp per capita categories');
plt.xlabel('GDP per capita quintile');
plt.ylabel('Total population of country');
def mult_boxplots(df, variable, category,
                  xlabel=None, ylabel=None, title=None,
                  ylim=None):
    """Boxplot of *variable* split by the levels of *category*.

    Axis labels, title and y-limits are applied only when provided.
    """
    df[[variable, category]].boxplot(by=category);
    if xlabel:
        plt.xlabel(xlabel)
    if ylabel:
        plt.ylabel(ylabel)
    if title:
        plt.title(title)
    if ylim:
        plt.ylim(ylim)
# Example: flood occurrence distribution across GDP quintiles.
mult_boxplots(recent, 'flood_occurence', 'gdp_bin', xlabel='GDP per capita quintile')
Since we have a lot of variables, we can rank them by F-value.
# One-way ANOVA of each variable across the GDP quintiles; rank by F-value
# and boxplot the top-n most discriminating variables.
cat = 'gdp_bin'
cat_no_bin = 'gdp_per_capita'
fps = []
for var in recent.columns.tolist():
    if var != cat and var != cat_no_bin:
        gb = recent[[var, cat]].dropna().groupby(cat)
        f, p = scipy.stats.f_oneway(*gb[var].apply(list).values.tolist())
        fps.append([var, f, p])
fps = pd.DataFrame(fps,
                   columns=['variable','f','p']).sort_values('f',
                   ascending=False)
fps.head()
n = 10
for var in fps.variable.tolist()[:n]:
    mult_boxplots(recent, var, cat)
# Full pairwise correlation matrix of the most recent period.
corr = recent.corr()
fig, ax = plt.subplots(figsize=(14,12));
sns.heatmap(corr, ax=ax);
plt.xlabel('');
plt.ylabel('');
plt.title('Correlation matrix heatmap');